library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.2
library(gridExtra)
library(GGally)
## Warning: package 'GGally' was built under R version 3.3.2
# Load the white-wine quality data set from its local CSV export.
wwine <- read.csv(
  file = "/Users/daolinghuang/Documents/datascience/UdacityNano/DataAnalysis/Project2_WhiteWine/wineQualityWhites.csv"
)
This dataset is publicly available for research. The details are described in [Cortez et al., 2009]. Please include this citation if you plan to use this database: P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis. Modeling wine preferences by data mining from physicochemical properties. In Decision Support Systems, Elsevier, 47(4):547-553. ISSN: 0167-9236. Available at: [@Elsevier] http://dx.doi.org/10.1016/j.dss.2009.05.016 [Pre-press (pdf)] http://www3.dsi.uminho.pt/pcortez/winequality09.pdf [bib] http://www3.dsi.uminho.pt/pcortez/dss09.bib
Input variables (based on physicochemical tests):

+ 1 - fixed acidity (tartaric acid - g / dm^3)
+ 2 - volatile acidity (acetic acid - g / dm^3)
+ 3 - citric acid (g / dm^3)
+ 4 - residual sugar (g / dm^3)
+ 5 - chlorides (sodium chloride - g / dm^3)
+ 6 - free sulfur dioxide (mg / dm^3)
+ 7 - total sulfur dioxide (mg / dm^3)
+ 8 - density (g / cm^3)
+ 9 - pH
+ 10 - sulphates (potassium sulphate - g / dm^3)
+ 11 - alcohol (% by volume)

Output variable (based on sensory data):

+ 12 - quality (score between 0 and 10)
# Report the number of rows and columns of the loaded data frame.
dim(wwine)
## [1] 4898 13
# List the column names; the first column, "X", precedes the eleven
# physicochemical measurements and the quality score.
names(wwine)
## [1] "X" "fixed.acidity" "volatile.acidity"
## [4] "citric.acid" "residual.sugar" "chlorides"
## [7] "free.sulfur.dioxide" "total.sulfur.dioxide" "density"
## [10] "pH" "sulphates" "alcohol"
## [13] "quality"
# Preview the first rows of the data (six rows are printed below).
head(wwine)
## X fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 1 7.0 0.27 0.36 20.7 0.045
## 2 2 6.3 0.30 0.34 1.6 0.049
## 3 3 8.1 0.28 0.40 6.9 0.050
## 4 4 7.2 0.23 0.32 8.5 0.058
## 5 5 7.2 0.23 0.32 8.5 0.058
## 6 6 8.1 0.28 0.40 6.9 0.050
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
## 1 45 170 1.0010 3.00 0.45 8.8
## 2 14 132 0.9940 3.30 0.49 9.5
## 3 30 97 0.9951 3.26 0.44 10.1
## 4 47 186 0.9956 3.19 0.40 9.9
## 5 47 186 0.9956 3.19 0.40 9.9
## 6 30 97 0.9951 3.26 0.44 10.1
## quality
## 1 6
## 2 6
## 3 6
## 4 6
## 5 6
## 6 6
# Treat quality as a categorical score so that it is tabulated and plotted
# per level instead of being summarised as a continuous number.
wwine$quality <- factor(wwine$quality)

# Print the name and a summary of every column except the first one ("X").
# Iterating over the column names rather than `2:dim(wwine)[2]` avoids the
# backwards sequence c(2, 1) that `2:n` yields for a one-column data frame.
for (col_name in names(wwine)[-1]) {
  print(col_name)
  print(summary(wwine[[col_name]]))
}
## [1] "fixed.acidity"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.800 6.300 6.800 6.855 7.300 14.200
## [1] "volatile.acidity"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0800 0.2100 0.2600 0.2782 0.3200 1.1000
## [1] "citric.acid"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.2700 0.3200 0.3342 0.3900 1.6600
## [1] "residual.sugar"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.600 1.700 5.200 6.391 9.900 65.800
## [1] "chlorides"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00900 0.03600 0.04300 0.04577 0.05000 0.34600
## [1] "free.sulfur.dioxide"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 23.00 34.00 35.31 46.00 289.00
## [1] "total.sulfur.dioxide"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 9.0 108.0 134.0 138.4 167.0 440.0
## [1] "density"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.9871 0.9917 0.9937 0.9940 0.9961 1.0390
## [1] "pH"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.720 3.090 3.180 3.188 3.280 3.820
## [1] "sulphates"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.2200 0.4100 0.4700 0.4898 0.5500 1.0800
## [1] "alcohol"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 8.00 9.50 10.40 10.51 11.40 14.20
## [1] "quality"
## 3 4 5 6 7 8 9
## 20 163 1457 2198 880 175 5
As shown in the introduction, quality is scored between 0 (very bad) and 10 (excellent). The current data set contains only 7 of these levels (3 to 9), with level 6 being the most abundant. There are 11 chemical property variables, and we are going to explore potential relationships between wine quality and these chemical properties. We are looking for key variables that can be used to evaluate wine quality.
# Bar chart of the number of wines at each quality level. quality is a
# factor, so geom_bar() (which counts rows per level by default) replaces
# geom_histogram(stat = 'count'), which warned that its binning parameters
# were being ignored.
ggplot(aes(x = quality), data = wwine) +
  geom_bar(fill = 'red', color = 'black')
## Warning: Ignoring unknown parameters: binwidth, bins, pad
The quality distribution is approximately normal, with its peak at quality level 6.
# Univariate histograms of ten chemical properties, arranged in a grid.
# - `bins = 30` states the bin count explicitly (it is the value stat_bin()
#   otherwise falls back to with a "Pick better value" message).
# - `limits` is spelled out in full for every scale; p1 previously relied on
#   partial argument matching (`limit =`).
# - The x-axis limits deliberately trim the long tails, so observations
#   outside them are dropped from the plots (ggplot2 warns about the removed
#   rows).
p1 <- ggplot(aes(x = fixed.acidity), data = wwine) +
  geom_histogram(color = 'blue', bins = 30) +
  scale_x_continuous(breaks = seq(2, 12, 1), limits = c(2, 12))
p2 <- ggplot(aes(x = volatile.acidity), data = wwine) +
  geom_histogram(color = 'blue', bins = 30) +
  scale_x_continuous(breaks = seq(0, 0.9, 0.1), limits = c(0, 0.9))
p3 <- ggplot(aes(x = residual.sugar), data = wwine) +
  geom_histogram(color = 'blue', bins = 30) +
  scale_x_continuous(breaks = seq(0, 25, 5), limits = c(0, 25))
p4 <- ggplot(aes(x = chlorides), data = wwine) +
  geom_histogram(color = 'blue', bins = 30) +
  scale_x_continuous(breaks = seq(0, 0.1, 0.01), limits = c(0, 0.1))
p5 <- ggplot(aes(x = free.sulfur.dioxide), data = wwine) +
  geom_histogram(color = 'blue', bins = 30) +
  scale_x_continuous(breaks = seq(0, 90, 10), limits = c(0, 90))
p6 <- ggplot(aes(x = total.sulfur.dioxide), data = wwine) +
  geom_histogram(color = 'blue', bins = 30) +
  scale_x_continuous(breaks = seq(0, 300, 50), limits = c(0, 300))
p7 <- ggplot(aes(x = density), data = wwine) +
  geom_histogram(color = 'blue', bins = 30) +
  scale_x_continuous(breaks = seq(0.98, 1.01, 0.01), limits = c(0.98, 1.01))
p8 <- ggplot(aes(x = pH), data = wwine) +
  geom_histogram(color = 'blue', bins = 30) +
  scale_x_continuous(breaks = seq(2.7, 3.9, 0.2), limits = c(2.7, 3.9))
p9 <- ggplot(aes(x = sulphates), data = wwine) +
  geom_histogram(color = 'blue', bins = 30) +
  scale_x_continuous(breaks = seq(0, 1.2, 0.1), limits = c(0, 1.2))
p10 <- ggplot(aes(x = alcohol), data = wwine) +
  geom_histogram(color = 'blue', bins = 30) +
  scale_x_continuous(breaks = seq(8, 14, 1), limits = c(8, 14))
grid.arrange(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, ncol = 3)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (stat_bin).
## Warning: Removed 1 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 6 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 5 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 110 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 26 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 6 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing non-finite values (stat_bin).
## Warning: Removed 1 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing non-finite values (stat_bin).
## Warning: Removed 1 rows containing missing values (geom_bar).
It is apparent that the residual.sugar and alcohol distributions are skewed, while the distributions of all the other variables are approximately normal once outliers are omitted. So the residual.sugar and alcohol distributions are candidates for transformation.
# Re-draw the two skewed variables on a log10 x axis and show them side by
# side.
p3_new <- ggplot(data = wwine, mapping = aes(x = residual.sugar)) +
  geom_histogram(color = 'yellow') +
  scale_x_log10()
p10_new <- ggplot(data = wwine, mapping = aes(x = alcohol)) +
  geom_histogram(color = 'yellow') +
  scale_x_log10()
grid.arrange(p3_new, p10_new, ncol = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Even after the log10 transformation, they are still not normally distributed.
It is a CSV file with 4898 records and 13 variables. These 13 variables are X (the row number), fixed.acidity, volatile.acidity, citric.acid, residual.sugar, chlorides, free.sulfur.dioxide, total.sulfur.dioxide, density, pH, sulphates, alcohol and quality. These variables are all numeric or integer. However, the quality variable should be converted to a factor so that it is better represented.
Quality is definitely the main feature, because I am investigating which chemical properties can affect it. However, from the one-dimensional plots I can't tell which of the chemical property variables weighs more than the others. From the description of each variable, acidity, sugar, alcohol and sulfur dioxide can all affect the taste of wine, so at this point I assume any of them may matter. To figure out which has more influence, I need to move on to multivariate analysis.
investigation into your feature(s) of interest? Not clear yet.
No. But I transformed two variables, residual.sugar and alcohol, using the scale_x_log10 function because they are skewed.
Did you perform any operations on the data to tidy, adjust, or change the form
of the data? If so, why did you do this?
I transformed two variables, residual.sugar and alcohol, using the scale_x_log10 function because they are skewed. After the transformation they are no longer skewed, but they are not normally distributed either.
# Fix the RNG seed so the same 1000-row subsample is drawn on every run,
# then draw the pairwise plot matrix for that subsample.
set.seed(4538)
ggpairs(
  data = wwine[sample.int(n = nrow(wwine), size = 1000), ]
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Boxplots of ten chemical properties grouped by quality level, arranged in
# a three-column grid.
b1 <- ggplot(data = wwine, mapping = aes(x = quality, y = fixed.acidity)) +
  geom_boxplot()
b2 <- ggplot(data = wwine, mapping = aes(x = quality, y = volatile.acidity)) +
  geom_boxplot()
b3 <- ggplot(data = wwine, mapping = aes(x = quality, y = residual.sugar)) +
  geom_boxplot()
b4 <- ggplot(data = wwine, mapping = aes(x = quality, y = chlorides)) +
  geom_boxplot()
b5 <- ggplot(data = wwine, mapping = aes(x = quality, y = free.sulfur.dioxide)) +
  geom_boxplot()
b6 <- ggplot(data = wwine, mapping = aes(x = quality, y = total.sulfur.dioxide)) +
  geom_boxplot()
b7 <- ggplot(data = wwine, mapping = aes(x = quality, y = density)) +
  geom_boxplot()
b8 <- ggplot(data = wwine, mapping = aes(x = quality, y = pH)) +
  geom_boxplot()
b9 <- ggplot(data = wwine, mapping = aes(x = quality, y = sulphates)) +
  geom_boxplot()
b10 <- ggplot(data = wwine, mapping = aes(x = quality, y = alcohol)) +
  geom_boxplot()
grid.arrange(b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, ncol = 3)
The ggpairs matrix already contains boxplots of quality against every other variable. Since the boxplots in ggpairs are too small to show details, I replotted them separately. From these boxplots, we can see that the plots of residual.sugar, density and alcohol vs quality have relatively few outliers, while volatile.acidity and chlorides vs quality have many compared to their narrow interquartile ranges. Basically, I can't see a simple linear relationship between quality and any chemical property variable. From the ggpairs table, the correlation coefficients between each pair of chemical property variables are obtained. It is noticeable that there are strong correlations between density and residual.sugar (0.830) and between alcohol and density (-0.799), moderate correlations between total.sulfur.dioxide and free.sulfur.dioxide (0.603) and between total.sulfur.dioxide and density (0.539), and weak correlations between fixed.acidity and pH (-0.46), alcohol and residual.sugar (-0.448), total.sulfur.dioxide and alcohol (-0.441), total.sulfur.dioxide and residual.sugar (0.421), and alcohol and chlorides (-0.349). In other words, the four variables density, alcohol, residual.sugar and total.sulfur.dioxide are the most interrelated. I'd like to dig into these further, with quality as the facet.
# Scatter plots of the most correlated variable pairs, one panel per quality
# level.
ggplot(data = wwine, mapping = aes(x = residual.sugar, y = density)) +
  geom_point() +
  facet_wrap(~quality)
ggplot(data = wwine, mapping = aes(x = alcohol, y = density)) +
  geom_point() +
  facet_wrap(~quality)
ggplot(data = wwine, mapping = aes(x = free.sulfur.dioxide, y = total.sulfur.dioxide)) +
  geom_point() +
  facet_wrap(~quality)
ggplot(data = wwine, mapping = aes(x = density, y = total.sulfur.dioxide)) +
  geom_point() +
  facet_wrap(~quality)
ggplot(data = wwine, mapping = aes(x = fixed.acidity, y = pH)) +
  geom_point() +
  facet_wrap(~quality)
ggplot(data = wwine, mapping = aes(x = alcohol, y = total.sulfur.dioxide)) +
  geom_point() +
  facet_wrap(~quality)
ggplot(data = wwine, mapping = aes(x = residual.sugar, y = alcohol)) +
  geom_point() +
  facet_wrap(~quality)
ggplot(data = wwine, mapping = aes(x = residual.sugar, y = total.sulfur.dioxide)) +
  geom_point() +
  facet_wrap(~quality)
ggplot(data = wwine, mapping = aes(x = alcohol, y = chlorides)) +
  geom_point() +
  facet_wrap(~quality)
It is noticeable that there are strong correlations between density and residual.sugar (0.830) and between alcohol and density (-0.799), moderate correlations between total.sulfur.dioxide and free.sulfur.dioxide (0.603) and between total.sulfur.dioxide and density (0.539), and weak correlations between fixed.acidity and pH (-0.46), alcohol and residual.sugar (-0.448), total.sulfur.dioxide and alcohol (-0.441), total.sulfur.dioxide and residual.sugar (0.421), and alcohol and chlorides (-0.349). In other words, the four variables density, alcohol, residual.sugar and total.sulfur.dioxide are the most interrelated.

### What was the strongest relationship you found?
The correlation between density and residual.sugar (0.830)
# The same variable pairs in a single panel each: points are coloured by
# quality level and a linear-model fit over all points is overlaid.
ggplot(data = wwine, mapping = aes(x = residual.sugar, y = density)) +
  geom_point(mapping = aes(color = quality)) +
  geom_smooth(method = 'lm')
ggplot(data = wwine, mapping = aes(x = alcohol, y = density)) +
  geom_point(mapping = aes(color = quality)) +
  geom_smooth(method = 'lm')
ggplot(data = wwine, mapping = aes(x = free.sulfur.dioxide, y = total.sulfur.dioxide)) +
  geom_point(mapping = aes(color = quality)) +
  geom_smooth(method = 'lm')
ggplot(data = wwine, mapping = aes(x = density, y = total.sulfur.dioxide)) +
  geom_point(mapping = aes(color = quality)) +
  geom_smooth(method = 'lm')
ggplot(data = wwine, mapping = aes(x = fixed.acidity, y = pH)) +
  geom_point(mapping = aes(color = quality)) +
  geom_smooth(method = 'lm')
ggplot(data = wwine, mapping = aes(x = alcohol, y = total.sulfur.dioxide)) +
  geom_point(mapping = aes(color = quality)) +
  geom_smooth(method = 'lm')
ggplot(data = wwine, mapping = aes(x = residual.sugar, y = alcohol)) +
  geom_point(mapping = aes(color = quality)) +
  geom_smooth(method = 'lm')
ggplot(data = wwine, mapping = aes(x = residual.sugar, y = total.sulfur.dioxide)) +
  geom_point(mapping = aes(color = quality)) +
  geom_smooth(method = 'lm')
ggplot(data = wwine, mapping = aes(x = alcohol, y = chlorides)) +
  geom_point(mapping = aes(color = quality)) +
  geom_smooth(method = 'lm')
Instead of treating quality as a facet, I can also treat it as a color variable and put it inside the plots. But to me, there is no clear clue to evaluate the quality.
So I searched online for a way to evaluate a wine. From http://winefolly.com/review/wine-characteristics/, I learned the five characteristics of wine: sweetness, acidity, tannin, fruit and body. Based on this information, I propose a simple model to apply to the data set. There is no variable associated with tannin, fruit or body in the current data set, so I will focus on the balance of sweetness and acidity. In a simple model, sweetness can be characterised by residual.sugar. Acidity, however, is confounded with alcohol, since higher alcohol leads to higher perceived acidity. I haven't found a good way to scale alcohol to acidity, so I will just add the acids up. In total, then, the balanced acidity = fixed.acidity + volatile.acidity + citric.acid - residual.sugar
# Derived feature: the three acid measurements added together minus the
# residual sugar.
wwine$balance <- with(
  wwine,
  fixed.acidity + volatile.acidity + citric.acid - residual.sugar
)
# One cross (shape 4) per quality level, placed at the median balance.
ggplot(data = wwine, mapping = aes(x = quality, y = balance)) +
  geom_point(stat = 'summary', fun.y = 'median', shape = 4)
Unfortunately, not really. I guess the wine quality evaluation may be a more complicated thing that requires balances on different chemical properties.
I haven’t found any.
Even with a new model, the analysis goes nowhere.
Tip: You’ve done a lot of exploration and have built up an understanding of the structure of and relationships between the variables in your dataset. Here, you will select three plots from all of your previous exploration to present here as a summary of some of your most interesting findings. Make sure that you have refined your selected plots for good titling, axis labels (with units), and good aesthetic choices (e.g. color, transparency). After each plot, make sure you justify why you chose each plot by describing what it shows.
# Final plot one: log10-scaled histograms of residual sugar and alcohol with
# axis labels and bold titles, shown side by side.
p3_new <- ggplot(data = wwine, mapping = aes(x = residual.sugar)) +
  geom_histogram(color = 'blue') +
  scale_x_log10() +
  labs(
    x = 'Residual Sugar(g/dm^3)',
    y = 'Count',
    title = 'Transformed Distribution of Residual Sugar'
  ) +
  theme(plot.title = element_text(face = 'bold', size = 12))
p10_new <- ggplot(data = wwine, mapping = aes(x = alcohol)) +
  geom_histogram(color = 'red') +
  scale_x_log10() +
  labs(
    x = 'Alcohol(% by volume)',
    y = 'Count',
    title = 'Transformed Distribution of Alcohol'
  ) +
  theme(plot.title = element_text(face = 'bold', size = 12))
grid.arrange(p3_new, p10_new, ncol = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Final plot two: density against residual sugar, one panel per quality
# level. Both axes are restricted to fixed limits, so points outside them
# are not drawn.
ggplot(data = wwine, mapping = aes(x = residual.sugar, y = density)) +
  geom_point() +
  facet_wrap(~quality) +
  labs(
    x = 'Residual Sugar(g/dm^3)',
    y = 'Density(g/cm^3)',
    title = 'Density vs Residual Sugar'
  ) +
  theme(plot.title = element_text(face = 'bold', size = 12)) +
  scale_x_continuous(limits = c(0, 20), breaks = seq(0, 20, 1)) +
  scale_y_continuous(
    limits = c(0.986, 1.004),
    breaks = seq(0.986, 1.004, 0.002)
  )
## Warning: Removed 18 rows containing missing values (geom_point).
http://127.0.0.1:14728/graphics/plot_zoom_png?width=494&height=672 ### Plot Three
# Final plot three: recompute the balance feature (summed acids minus
# residual sugar) and plot its median per quality level as crosses, with
# axis labels and a bold title.
wwine$balance <- with(
  wwine,
  fixed.acidity + volatile.acidity + citric.acid - residual.sugar
)
ggplot(data = wwine, mapping = aes(x = quality, y = balance)) +
  geom_point(stat = 'summary', fun.y = 'median', shape = 4) +
  labs(
    x = 'Quality',
    y = 'Balance (g/dm^3)',
    title = 'Balance vs Quality'
  ) +
  theme(plot.title = element_text(face = 'bold', size = 12))